{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "mC0paOn8vrec"
   },
   "outputs": [],
   "source": [
    "! pip install SPARQLWrapper\n",
    "! pip install python-dateutil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from SPARQLWrapper import SPARQLWrapper\n",
    "from rdflib import Graph\n",
    "import _pickle as cPickle\n",
    "import pandas as pd\n",
    "import rdflib\n",
    "import os\n",
    "import re\n",
    "os.environ['TF_CPP_MIN_LOG_LEVEL']='4'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dy_filename = \"./../DY-NB/dbp_yago.ttl\"\n",
    "prox_graph_file = \"./yago_pred_prox_graph\"\n",
    "graph = Graph()\n",
    "graph.parse(location=dy_filename, format='nt')\n",
    "print(\"len(graph):\", len(graph))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "aWkJj3NauxAl"
   },
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "def getRdfType(Q):\n",
    "    Q_types = []\n",
    "\n",
    "    queryString = \"\"\"\n",
    "    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
    "    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n",
    "    PREFIX dbr: <http://dbpedia.org/resource>\n",
    "    PREFIX dbo: <http://dbpedia.org/ontology>\n",
    "    SELECT DISTINCT ?obj WHERE{\n",
    "    \"\"\"+ Q +\"\"\" rdf:type ?obj\n",
    "    FILTER strstarts(str(?obj), str(dbo:))\n",
    "    }\n",
    "    \"\"\"\n",
    "\n",
    "    sparql = SPARQLWrapper(\"http://dbpedia.org/sparql\")\n",
    "\n",
    "    sparql.setQuery(queryString)  \n",
    "    sparql.setTimeout(1000)\n",
    "\n",
    "    sparql.setReturnFormat(\"json\")\n",
    "\n",
    "    try:\n",
    "        results = sparql.query().convert()\n",
    "        for result in results[\"results\"][\"bindings\"]:\n",
    "            Q_types.append(result[\"obj\"][\"value\"].replace(\"http://dbpedia.org/ontology/\",\"\"))\n",
    "        return Q_types\n",
    "    except TimeoutError:\n",
    "        return []\n",
    "\n",
    "def dataType(string):\n",
    "    odp='string'\n",
    "    patternBIT=re.compile('[01]')\n",
    "    patternINT=re.compile('[0-9]+')\n",
    "    patternFLOAT=re.compile('[0-9]+\\.[0-9]+')\n",
    "    patternTEXT=re.compile('[a-zA-Z0-9]+')\n",
    "    patternDate=re.compile('(\\d{4})-(\\d{2})-(\\d{2})')\n",
    "    if patternTEXT.match(string):\n",
    "        odp= \"string\"\n",
    "    if patternINT.match(string):\n",
    "        odp= \"integer\"\n",
    "    if patternFLOAT.match(string):\n",
    "        odp= \"float\"\n",
    "    if patternDate.match(string):\n",
    "        odp= \"date\"\n",
    "    return odp\n",
    "\n",
    "\n",
    "def getRDFData(o):\n",
    "    if str(o).startswith('http://dbpedia.org/resource/'):\n",
    "        Q_entity = \"<\"+o+\">\"\n",
    "        data_type = getRdfType(Q_entity)\n",
    "    else:\n",
    "        data_type = [dataType(o)]\n",
    "    \n",
    "    return o, data_type\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_to_set(types, typeset):\n",
    "    for t in types:\n",
    "        typeset.add(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "4M-Na-6eQEUf"
   },
   "outputs": [],
   "source": [
    "typeset1 = set()\n",
    "typeset2 = set()\n",
    "\n",
    "prox_graph = []\n",
    "i=0\n",
    "for s,p,o in graph:\n",
    "    i += 1\n",
    "    s, s_data_type = getRDFData(str(s)) # change data type\n",
    "    o, o_data_type = getRDFData(str(o))\n",
    "    \n",
    "    add_to_set(s_data_type, typeset1)\n",
    "    add_to_set(o_data_type, typeset2)\n",
    "    \n",
    "    prox_triple_list = [','.join(s_data_type), p, ','.join(o_data_type)]\n",
    "    prox_triple_string = '\\t'.join(prox_triple_list)\n",
    "\n",
    "    prox_graph.append(prox_triple_string)\n",
    "\n",
    "    if i % 1000 == 0:\n",
    "        with open(f\"{prox_graph_file}.txt\", 'a+') as f:\n",
    "            for prox_i in prox_graph:\n",
    "                f.write(str(prox_i))\n",
    "                f.write('\\n')\n",
    "        prox_graph = []\n",
    "        print(\"i: \", i)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('./typeset1.txt', 'w') as f:\n",
    "    f.write(','.join(list(typeset1)))\n",
    "with open('./typeset2.txt', 'w') as f:\n",
    "    f.write(','.join(list(typeset2)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyM8ypdK6jmqkiIicN5Fv1zZ",
   "mount_file_id": "1xsVCx02lcv_oOtjOtQ3uPuxuP6A4Adwz",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
